#######################################################################################
### THIS SCRIPT READS ATTA SENSOR NODE DATA FILES, COMPILES THEM, FILLS MISSING     ###
### LINES, REMOVES DUPLICATES AND SORTS. DATA IS WRITTEN TO FILE, COMPILED INTO     ###
### DAILY SUMMARIES AND WRITTEN TO FILE (TAB SEPARATED FOR EXCEL)                   ###
#######################################################################################
#######################################################################################
### Clear all variables and set working directory					                        	###
#######################################################################################
rm(list = ls(all = TRUE))
HomeDirectory <- getwd()      ### directory this R script lives in
setwd("..")                   ### move one level up to the project root
### Input/output directories. ReadDirectory is informational only: Sys.glob below
### builds the full input paths itself.
ReadDirectory  <- file.path(getwd(), "NewNest3_Level0")
WriteDirectory <- file.path(getwd(), "NewNest3_Level1")   ### daily summaries are written here
### List all sensor-node data file paths.
### Note: most older files were moved to a subdir as some have fewer columns
### (download without TS).
FilePaths <- Sys.glob(file.path(getwd(), "NewNest3_Level0", "*Nest 3_Table1*.*"))
length(FilePaths)             ### how many files were found
#############################################################################################
### Read logger data.                                                                     ###
### All matched files are read and combined in one step (do.call/rbind), which also       ###
### works when only a single file matches -- the previous 2:length(FilePaths) loop        ###
### errored on one file and grew the data frame on every iteration (O(n^2) copies).       ###
### Note: CR1000 headers are 4 lines with the second line containing the variable names;  ###
### we skip 4 lines and explicitly assign the variable names further down.                ###
#############################################################################################
DataLogger <- do.call(rbind, lapply(FilePaths, function(FilePath) {
  ### na.strings spelled out in full; the old "na.string" relied on partial argument matching
  read.csv(FilePath, stringsAsFactors = FALSE, header = FALSE, skip = 4,
           na.strings = c("", "NAN"))
}))
##FilePaths[i]
str(DataLogger)                     ### get to know your data
summary(DataLogger)
names(DataLogger)                   ### inspect the default names before renaming
### Assign variable names; order follows the CR1000 table definition.
LoggerNames <- c(
  "TS", "RN",
  "VWC_2cm_avg",  "T_2cm_avg",
  "VWC_8cm_avg",  "T_8cm_avg",
  "VWC_16cm_avg", "T_16cm_avg",
  "VWC_50cm_avg", "T_50cm_avg",
  "O2_1_mV",      "O2_2_mV",      "O2_3_mV",      "O2_4_mV",
  "O2_1_kPa_avg", "O2_2_kPa_avg", "O2_3_kPa_avg", "O2_4_kPa_avg",
  "TC_1_avg",     "TC_2_avg",     "TC_3_avg",     "TC_4_avg",
  "CO2_1_avg",    "CO2_2_avg",    "CO2_3_avg",    "CO2_4_avg"
)
names(DataLogger) <- LoggerNames
### Parse the timestamp string to POSIXct. UTC avoids daylight-saving issues,
### but the values themselves remain local times.
DataLogger$TS <- as.POSIXct(DataLogger$TS, tz = "UTC")
str(DataLogger)
summary(DataLogger)
summary(DataLogger$TS)
plot(DataLogger$TS)                 ### check timestamp overlap between files
#############################################################################################
### NOTE(review): a verbatim copy of the setup + file-reading + renaming section above    ###
### was repeated here (plus a stray interactive "i"), re-reading every input file and     ###
### rebuilding DataLogger to the identical state. That console residue has been removed:  ###
### DataLogger already holds the renamed, POSIXct-stamped data at this point.             ###
#############################################################################################
#############################################################################################
### Cut the data to the boundaries of the first and last complete day if needed.          ###
### The first TS belongs to the previous day; the last day of the last data block is      ###
### incomplete as data for the last half hour is dated the next day (next 6-month block). ###
### Then fill missing lines and remove doubles (based on TS).                             ###
### Read in another block to complete the last day.                                       ###
#############################################################################################
summary(DataLogger$TS)
plot(DataLogger$TS)                          ### check timestamp overlap
LowerBound <- as.POSIXct("2016-02-24 00:00", tz = "UTC")
UpperBound <- as.POSIXct("2016-10-28 00:00", tz = "UTC")
DataLogger <- DataLogger[which(DataLogger$TS > LowerBound & DataLogger$TS <= UpperBound), ]
StartTS <- min(DataLogger$TS)                ### first timestamp of the trimmed dataset
EndTS   <- max(DataLogger$TS)                ### last timestamp of the trimmed dataset
AllTS   <- seq(StartTS, EndTS, by = 300)     ### every expected timestamp at 5-min (300 s) steps
### Build an all-NA frame holding every expected timestamp, append it to the data,
### then drop duplicate timestamps and sort chronologically. Because the real rows
### come first in the rbind, duplicated(fromLast = FALSE) keeps the real record and
### discards the NA filler for timestamps that were actually measured.
GapFiller <- data.frame(AllTS, array(dim = c(length(AllTS), ncol(DataLogger) - 1)))
names(GapFiller) <- names(DataLogger)          ### same structure as the raw data
DataLogger <- rbind(DataLogger, GapFiller)     ### real rows first, filler rows second
KeepRow <- !duplicated(DataLogger$TS, fromLast = FALSE)   ### first occurrence per TS wins
DataLogger <- DataLogger[which(KeepRow), ]
DataLogger <- DataLogger[order(DataLogger$TS), ]          ### sort by timestamp
rm(GapFiller, AllTS, StartTS, EndTS, KeepRow)
plot(DataLogger$TS)                            ### check timestamps again
which(is.na(DataLogger$RN))                    ### missing record number = proxy for missing records
DataLogger$TS[which(is.na(DataLogger$RN))]     ### timestamps of potentially missing records
str(DataLogger)                                ### structure of the compiled object
names(DataLogger)
### Replace the sensor error code in all VWC columns with a plausible value.
### NOTE(review): 7999 is assumed to be the logger's sensor-failure sentinel and 0.52 a
### site-typical VWC -- confirm against the logger program / field notes.
plot( DataLogger$TS, DataLogger$VWC_2cm_avg, ylim = c(0.45, 0.55), type = "l" )
VWCSentinel  <- 7999    ### raw value written on sensor mis-read
VWCFillValue <- 0.52    ### replacement value
VWCColumns <- c("VWC_2cm_avg", "VWC_8cm_avg", "VWC_16cm_avg", "VWC_50cm_avg")
for (VWCColumn in VWCColumns) {   ### one loop instead of four copy-pasted lines
  DataLogger[[VWCColumn]][which(DataLogger[[VWCColumn]] == VWCSentinel)] <- VWCFillValue
}
summary(DataLogger)
str(DataLogger)
#############################################################################################
### Calculate Julian day (DOY), month and year for all data.                              ###
### This makes it easy to compile data into daily and monthly averages/totals later on.   ###
#############################################################################################
### Shift back one 5-min interval (300 s, NOT half an hour as an old comment claimed) so
### that records stamped at midnight get the DOY of the day they belong to.
TempTS <- as.POSIXlt(DataLogger$TS - 300)
### TempTS is now the start of the 5-min interval the data was collected over.
Year <- TempTS$year + 1900					                        ### extract the year (POSIXlt years count from 1900)
Month <- TempTS$mon + 1                                     ### extract the month (POSIXlt months are 0-based)
DOY <- TempTS$yday + 1						                          ### extract day of year (POSIXlt yday is 0-based)
### Shift TS itself back 5 min and insert Year/Month/DOY between TS and the data columns.
DataLogger <- cbind( DataLogger[1]-300, Year, Month, DOY, DataLogger[,2:26])
rm(TempTS, Year, Month, DOY)
str(DataLogger)
#############################################################################################
### This concludes preparation of the 5-min sensor node dataset.                          ###
### NOTE THAT BASIC DATA QCQA NEEDS IMPLEMENTATION                                        ###
### Write to file; the .xls output is tab-separated text readable by Excel.               ###
#############################################################################################
setwd(WriteDirectory)
OutFileBase <- "NewNest3_Feb2016_Nov2016"
write.csv(DataLogger, file = paste0(OutFileBase, ".dat"), row.names = FALSE)
write.table(DataLogger, file = paste0(OutFileBase, ".xls"), row.names = FALSE, sep = "\t")
############################################################################################
### NOW WE WILL CREATE DAILY AND MONTHLY DATA SUMMARIES                                  ###
### NAs are not removed from the dataset, which means they propagate through to the      ###
### daily values; use na.rm=TRUE in the aggregation to deal differently with this.       ###
############################################################################################
### trunc() on POSIXct returns POSIXlt, which does not belong inside a data frame;
### convert back to POSIXct explicitly (cbind did this implicitly before, via
### as.data.frame.POSIXlt -- now the conversion is visible).
Date <- as.POSIXct(trunc(DataLogger$TS, units = "days"))      ### calendar date for the daily-sums data frame
DataLogger <- cbind(DataLogger[1], Date, DataLogger[, 2:29])  ### insert Date right after the timestamp
rm(Date)
str(DataLogger)
DailyData <- unique(DataLogger[, 2:5])                             ### one row per day: Date, Year, Month, DOY
DataLogger$UniqueDays <- DataLogger$Year * 1000 + DataLogger$DOY   ### day key unique across years (DOY alone repeats)
### Columns to average per day (only the calibrated kPa O2 channels and the CO2
### channels are summarized; the raw mV and thermocouple channels are skipped).
DailyMeanColumns <- c(
  "VWC_2cm_avg",  "T_2cm_avg",
  "VWC_8cm_avg",  "T_8cm_avg",
  "VWC_16cm_avg", "T_16cm_avg",
  "VWC_50cm_avg", "T_50cm_avg",
  "O2_1_kPa_avg", "O2_2_kPa_avg", "O2_3_kPa_avg", "O2_4_kPa_avg",
  "CO2_1_avg",    "CO2_2_avg",    "CO2_3_avg",    "CO2_4_avg"
)
### One loop instead of 16 copy-pasted tapply() lines. Any NA within a day propagates
### to that day's mean (no na.rm), per the note above.
### NOTE(review): tapply() returns groups in sorted-key order while DailyData rows
### follow data order; these align only because the data are sorted by TS upstream --
### keep that sort, or merge on the day key instead.
for (ColumnName in DailyMeanColumns) {
  DailyData[[ColumnName]] <- as.numeric(
    tapply(DataLogger[[ColumnName]], DataLogger$UniqueDays, mean)
  )
}
#############################################################################################
### This concludes preparation of the sensor node DAILY dataset.                          ###
### NOTE THAT BASIC DATA QCQA NEEDS IMPLEMENTATION AT 5MIN LEVEL                          ###
### Write to file; the .xls output is tab-separated text readable by Excel.               ###
#############################################################################################
setwd(WriteDirectory)
DailyFileBase <- "NewNest3_DAILY_Feb2016_Nov2016"
write.csv(DailyData, file = paste0(DailyFileBase, ".dat"), row.names = FALSE)
write.table(DailyData, file = paste0(DailyFileBase, ".xls"), row.names = FALSE, sep = "\t")
#############################################################################################
### OPTIONAL: quick diagnostic plots using {base}.                                        ###
### NOTE(review): this section previously contained several repeated copies of the O2     ###
### plot and its overlays (interactive console residue); one copy of each plot is kept,  ###
### using the final overlay scaling.                                                      ###
#############################################################################################
### VWC ###
plot( DailyData$Date, DailyData$VWC_2cm_avg, type = "l", ylim = c(0,0.6), ylab = "VWC (%)" )
lines( DailyData$Date, DailyData$VWC_8cm_avg, type = "l", col = "red")
lines( DailyData$Date, DailyData$VWC_16cm_avg, type = "l", col = "blue")
lines( DailyData$Date, DailyData$VWC_50cm_avg, type = "l", col = "green")
### Tsoil ###
plot( DailyData$Date, DailyData$T_2cm_avg, type = "l", ylim = c(20,28), ylab = "Tsoil (deg C)" )
lines( DailyData$Date, DailyData$T_8cm_avg, type = "l", col = "red")
lines( DailyData$Date, DailyData$T_16cm_avg, type = "l", col = "blue")
lines( DailyData$Date, DailyData$T_50cm_avg, type = "l", col = "green")
### O2, with rescaled shallow VWC overlaid for visual comparison ###
plot( DailyData$Date, DailyData$O2_1_kPa_avg, type = "l", ylim = c(0,30), ylab = "O2 (kPa)" )
lines( DailyData$Date, DailyData$O2_2_kPa_avg, type = "l", col = "red")
lines( DailyData$Date, DailyData$O2_3_kPa_avg, type = "l", col = "blue")
lines( DailyData$Date, DailyData$O2_4_kPa_avg, type = "l", col = "green")
lines( DailyData$Date, DailyData$VWC_2cm_avg*250 - 85, type = "l", col = "magenta" )  ### ad-hoc scaling, shape comparison only
### CO2 ###
plot( DailyData$Date, DailyData$CO2_1_avg, type = "l", ylim = c(0,5), ylab = "CO2 (%)" )
lines( DailyData$Date, DailyData$CO2_2_avg, type = "l", col = "red")
lines( DailyData$Date, DailyData$CO2_3_avg, type = "l", col = "blue")
lines( DailyData$Date, DailyData$CO2_4_avg, type = "l", col = "green")
